In [1]:
# Load IPython's autoreload extension and set it to mode 2, which re-imports
# all modules before each cell is executed, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline

In [2]:
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

In [23]:
## load data
data = pd.read_csv("data/amazon_employee_access/train.csv")
## make ids recognized as categorical data
# Every column except the first (ACTION) is cast to string so the numeric id
# codes are not treated as quantities.
# The builtin `str` is used instead of `np.str`: the latter was only an alias
# of the builtin, was deprecated in NumPy 1.20 and removed in NumPy 1.24.
for f in data.columns[1:]:
    data[f] = data[f].astype(str)
# Preview the first three rows to confirm the load and the conversion.
data.head(3)


Out[23]:
ACTION RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME ROLE_TITLE ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE
0 1 39353 85475 117961 118300 123472 117905 117906 290919 117908
1 1 17183 1540 117961 118343 123125 118536 118536 308574 118539
2 1 36724 14457 118219 118220 117884 117879 267952 19721 117880

In [25]:
## exploration session
# Build a Session over the loaded frame, passing "ACTION" as the second
# argument (presumably the target column -- confirm against munging.session)
# and random_state=0 (presumably a seed for reproducibility -- confirm).
dsession = session.Session(data, "ACTION", random_state=0)
# Empty list, not used in this cell; presumably filled by later cells -- TODO confirm.
transformers = []
# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(dsession.get_parameters())


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [26]:
## numerical and categorical features
# Ask the session which features satisfy each of its two predicates.
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
# Report how many features fall in each group. Single-argument print(...)
# behaves identically on Python 2 and Python 3.
print(len(numerical_feats))
print(len(categorical_feats))


0
9

In [28]:
## knowing what you are dealing with
# Fraction of rows carrying each ACTION value (class balance of the target).
# Series.value_counts replaces the deprecated top-level pd.value_counts, and
# the explicit float() replaces the `* 1./` trick for forcing true division.
# The denominator stays the total row count, exactly as before.
data["ACTION"].value_counts() / float(data.shape[0])


Out[28]:
1    0.94211
0    0.05789
dtype: float64

In [ ]: